Next, consider how to crawl pages that do not expose an API, so we can handle some of the more unusual cases.
Below we try to gauge the netizens' sentiment through the PTT Gossiping board.
www.ptt.cc
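Before building the full crawler, the short probe below (a sketch, not part of the final class) shows why this board needs an extra step: without the over18 cookie, PTT is expected to redirect the Gossiping index to its age-confirmation page.

import requests

# Minimal probe: without the "over18" cookie, PTT redirects the Gossiping
# index to the age-confirmation page instead of serving the article list.
rs = requests.Session()
res = rs.get('https://www.ptt.cc/bbs/Gossiping/index.html')
print(res.url)  # expected to end with /ask/over18?from=... when the age gate is hit

# Setting the cookie directly (the class below instead POSTs the consent form)
# lets the same session reach the real board index.
rs.cookies.set('over18', '1', domain='.ptt.cc')
res = rs.get('https://www.ptt.cc/bbs/Gossiping/index.html')
print(res.status_code)  # 200, and res.text now contains the .r-ent rows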
The following code does the actual crawling:

import requests
import cloudscraper  # only used by the commented-out Cloudflare fallback below
from bs4 import BeautifulSoup


class PttGossiping:
    def __init__(self):
        # Confirm the over-18 prompt once so the session keeps the over18 cookie
        payload = {
            'from': '/bbs/Gossiping/index.html',
            'yes': 'yes'
        }
        self.rs = requests.Session()
        res = self.rs.post('https://www.ptt.cc/ask/over18', data=payload)
        #self.scraper = cloudscraper.create_scraper(sess=self.rs)
        #res = self.scraper.post('https://www.ptt.cc/ask/over18', data=payload)
        self.soups = []

    def fetch_article(self, fetch_link=None):
        # set the fetch url then req and soup
        if not fetch_link:
            # default to the board index on www.ptt.cc; the selectors used
            # elsewhere in this class assume ptt.cc markup
            fetch_link = 'https://www.ptt.cc/bbs/Gossiping/index.html'
            #fetch_link = 'https://www.pttweb.cc/bbs/Gossiping'
        res = self.rs.get(fetch_link)
        soup = BeautifulSoup(res.text, 'html.parser')
        return soup

    def get_next_page_link(self, origin_link):
        # on the PTT index, the「上頁」(previous page) button points to the next batch of older articles
        original_soup = self.fetch_article(origin_link)
        for row in original_soup.select('.btn-group-paging > .btn'):
            if '上頁' in row.text:
                page_link = row.get('href')
                full_url = f'https://www.ptt.cc{page_link}'
                return full_url

    def fetch_bao_article(self, threshold=None, pages=None, fetch_link=None):
        if not threshold:
            threshold = 80
        if not pages:
            pages = 10
        # page first
        if not fetch_link:
            fetch_link = 'https://www.ptt.cc/bbs/Gossiping/index.html'
        link = fetch_link
        print(f'Going to fetch {fetch_link}')
        soup = self.fetch_article(fetch_link)
        self.soups.append(soup)
        # additional page 1 ~ N
        for i in range(pages):
            print(f'Fetching page ... {i+1}')
            link = self.get_next_page_link(link)
            soup = self.fetch_article(link)
            self.soups.append(soup)
        results = []
        for soup in self.soups:
            for row in soup.select('.r-ent'):
                try:
                    rank = row.select('.hl')[0].text
                    title = None
                    link = None
                    if 'X' in rank:
                        continue
                    elif rank == '爆' or (int(rank) >= threshold):
                        title = row.select('.title')[0].text.strip('\n')
                        uri = row.find_all('a')[0]['href']
                        link = 'https://www.ptt.cc' + uri
                        res = {}
                        title = title.replace('\u3000', ' ')
                        res['title'] = title
                        res['link'] = link
                        res['rank'] = rank
                        results.append(res)
                except IndexError:
                    # deleted articles have no push count or title link; skip them
                    pass
                except Exception as e:
                    print(e)
        return results
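A short usage sketch follows; the threshold and page count here are arbitrary, and the actual output depends on whatever is on the board when you run it.

if __name__ == '__main__':
    ptt = PttGossiping()
    # collect articles whose push count is「爆」or at least 80, across 5 extra pages
    hot_articles = ptt.fetch_bao_article(threshold=80, pages=5)
    for article in hot_articles:
        print(article['rank'], article['title'])
        print(article['link'])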
Here we use find_all or find to pick out the features (tags and CSS classes) we are interested in.
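As a quick illustration of the difference between the two, the HTML below is a hand-written, simplified stand-in for one .r-ent row, not real board output:

from bs4 import BeautifulSoup

# Simplified stand-in for a single .r-ent row on the index page.
html = '''
<div class="r-ent">
  <div class="nrec"><span class="hl f1">爆</span></div>
  <div class="title"><a href="/bbs/Gossiping/M.1234567890.A.ABC.html">[問卦] 標題範例</a></div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
row = soup.find('div', class_='r-ent')      # find(): first match (or None)
links = row.find_all('a')                   # find_all(): always a list, possibly empty
print(row.find('span', class_='hl').text)   # 爆
print(links[0]['href'])                     # /bbs/Gossiping/M.1234567890.A.ABC.html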